# Set the knitr chunk option `echo = TRUE` for the whole report.
knitr::opts_chunk$set(echo = TRUE)
# Load the dataset from CSV; text columns are kept as character vectors
# rather than being converted to factors.
dsuaz <- read.csv("datasetUAZ.csv", stringsAsFactors = FALSE)
# Print the structure of the data frame (column names, types, first values).
str(dsuaz)
## 'data.frame': 113 obs. of 24 variables:
## $ hrs_estudio_dia : int 4 4 0 4 4 4 4 4 0 6 ...
## $ hrs_tv_dia : int 0 2 0 0 4 0 0 0 2 4 ...
## $ hrs_vjuegos_dia : int 4 0 0 0 4 0 0 0 2 0 ...
## $ hrs_actsocial_dia : int 2 1 2 1 0 3 0 1 1 0 ...
## $ hrs_universidad_dia : int 7 8 4 7 7 6 7 6 8 7 ...
## $ promedio : num 8 8.7 9.6 8.2 8.5 9.3 8 7.6 7.2 7 ...
## $ hrs_ejercicio_dia : int 0 0 4 0 4 0 0 0 4 0 ...
## $ hrs_dormir_dia : int 8 6 8 6 8 8 8 8 6 6 ...
## $ transporte_publico : int 1 1 0 1 1 1 1 1 0 1 ...
## $ comedor : int 1 1 0 1 0 1 0 0 0 1 ...
## $ becario : int 1 1 1 1 0 1 0 0 0 0 ...
## $ materias_repetidas : int 0 0 0 0 1 0 1 5 2 0 ...
## $ dificultad_licenciatura: int 3 2 2 4 3 2 4 3 2 4 ...
## $ pasion_licenciatura : int 8 9 8 9 9 10 10 7 6 7 ...
## $ oferta_laboral : int 5 8 9 10 9 4 10 9 8 5 ...
## $ Ssalario_estimado : chr "10 - 15 mil" "15 - 20 mil" "mas de 20 mil" "6 - 10 mil" ...
## $ hrs_trabajo : int 10 10 10 12 8 8 12 8 12 10 ...
## $ municipio : chr "Morelos" "Ojocaliente" "Orizaba" "Naucalpan" ...
## $ genero : chr "Hombre" "Hombre" "Hombre" "Mujer" ...
## $ edad : int 18 20 21 19 20 22 19 25 22 18 ...
## $ licenciatura : chr "Ingenieria de Software" "Quimico Farmaceutico Biologo" "Ingenieria de Software" "Medicina Humana" ...
## $ semestre : int 2 1 8 2 4 10 2 8 8 2 ...
## $ planes_egreso : chr "Posgrado" "Laborar" "Posgrado" "Posgrado" ...
## $ X : logi NA NA NA NA NA NA ...
head(dsuaz)
## hrs_estudio_dia hrs_tv_dia hrs_vjuegos_dia hrs_actsocial_dia
## 1 4 0 4 2
## 2 4 2 0 1
## 3 0 0 0 2
## 4 4 0 0 1
## 5 4 4 4 0
## 6 4 0 0 3
## hrs_universidad_dia promedio hrs_ejercicio_dia hrs_dormir_dia
## 1 7 8.0 0 8
## 2 8 8.7 0 6
## 3 4 9.6 4 8
## 4 7 8.2 0 6
## 5 7 8.5 4 8
## 6 6 9.3 0 8
## transporte_publico comedor becario materias_repetidas
## 1 1 1 1 0
## 2 1 1 1 0
## 3 0 0 1 0
## 4 1 1 1 0
## 5 1 0 0 1
## 6 1 1 1 0
## dificultad_licenciatura pasion_licenciatura oferta_laboral
## 1 3 8 5
## 2 2 9 8
## 3 2 8 9
## 4 4 9 10
## 5 3 9 9
## 6 2 10 4
## Ssalario_estimado hrs_trabajo municipio genero edad
## 1 10 - 15 mil 10 Morelos Hombre 18
## 2 15 - 20 mil 10 Ojocaliente Hombre 20
## 3 mas de 20 mil 10 Orizaba Hombre 21
## 4 6 - 10 mil 12 Naucalpan Mujer 19
## 5 15 - 20 mil 8 Huejuquilla el Alto Mujer 20
## 6 6 - 10 mil 8 Panfilo natera Mujer 22
## licenciatura semestre planes_egreso X
## 1 Ingenieria de Software 2 Posgrado NA
## 2 Quimico Farmaceutico Biologo 1 Laborar NA
## 3 Ingenieria de Software 8 Posgrado NA
## 4 Medicina Humana 2 Posgrado NA
## 5 Ingenieria de Software 4 Laborar NA
## 6 Psicologia 10 Posgrado NA
tail(dsuaz)
## hrs_estudio_dia hrs_tv_dia hrs_vjuegos_dia hrs_actsocial_dia
## 108 4 4 0 1
## 109 4 4 4 3
## 110 4 4 0 2
## 111 4 0 0 3
## 112 4 4 0 2
## 113 4 2 0 2
## hrs_universidad_dia promedio hrs_ejercicio_dia hrs_dormir_dia
## 108 7 9.30 0 8
## 109 4 8.30 0 8
## 110 6 8.00 4 8
## 111 6 8.57 4 8
## 112 6 9.50 0 8
## 113 6 8.70 2 6
## transporte_publico comedor becario materias_repetidas
## 108 1 1 0 0
## 109 1 1 0 0
## 110 1 0 0 0
## 111 1 1 1 0
## 112 1 1 0 0
## 113 1 0 0 0
## dificultad_licenciatura pasion_licenciatura oferta_laboral
## 108 3 10 5
## 109 3 9 10
## 110 2 9 0
## 111 2 8 10
## 112 2 5 5
## 113 3 9 8
## Ssalario_estimado hrs_trabajo municipio genero edad
## 108 6 - 10 mil 8 Zacatecas Mujer 22
## 109 10 - 15 mil 8 Jerez Hombre 22
## 110 6 - 10 mil 8 Zacatecas Mujer 19
## 111 15 - 20 mil 10 Panuco Hombre 22
## 112 10 - 15 mil 10 Guadalupe Mujer 22
## 113 15 - 20 mil 8 Huejuquilla el Alto Hombre 23
## licenciatura semestre planes_egreso X
## 108 Psicologia 10 Laborar NA
## 109 Ingenieria de Software 8 Posgrado NA
## 110 Psicologia 2 Posgrado NA
## 111 Ingenieria de Software 8 Posgrado NA
## 112 Psicologia 10 Posgrado NA
## 113 Quimico Farmaceutico Biologo 8 Laborar NA
summary(dsuaz)
## hrs_estudio_dia hrs_tv_dia hrs_vjuegos_dia hrs_actsocial_dia
## Min. :0.000 Min. :0.000 Min. :0.000 Min. : 0.000
## 1st Qu.:4.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.: 1.000
## Median :4.000 Median :0.000 Median :0.000 Median : 2.000
## Mean :4.248 Mean :1.593 Mean :1.168 Mean : 2.248
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.: 3.000
## Max. :8.000 Max. :6.000 Max. :6.000 Max. :10.000
## hrs_universidad_dia promedio hrs_ejercicio_dia hrs_dormir_dia
## Min. :0.000 Min. :7.000 Min. :0.000 Min. : 3.000
## 1st Qu.:6.000 1st Qu.:8.000 1st Qu.:0.000 1st Qu.: 6.000
## Median :7.000 Median :8.400 Median :0.000 Median : 8.000
## Mean :6.301 Mean :8.423 Mean :1.327 Mean : 6.973
## 3rd Qu.:7.000 3rd Qu.:8.980 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :8.000 Max. :9.800 Max. :4.000 Max. :10.000
## transporte_publico comedor becario materias_repetidas
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.8053 Mean :0.4956 Mean :0.3186 Mean :0.4071
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :5.0000
## dificultad_licenciatura pasion_licenciatura oferta_laboral
## Min. :1.000 Min. : 4.000 Min. : 0.000
## 1st Qu.:2.000 1st Qu.: 8.000 1st Qu.: 5.000
## Median :2.000 Median : 9.000 Median : 8.000
## Mean :2.549 Mean : 8.735 Mean : 7.097
## 3rd Qu.:3.000 3rd Qu.:10.000 3rd Qu.: 9.000
## Max. :4.000 Max. :10.000 Max. :10.000
## Ssalario_estimado hrs_trabajo municipio genero
## Length:113 Min. : 8.000 Length:113 Length:113
## Class :character 1st Qu.: 8.000 Class :character Class :character
## Mode :character Median :10.000 Mode :character Mode :character
## Mean : 9.637
## 3rd Qu.:10.000
## Max. :13.000
## edad licenciatura semestre planes_egreso
## Min. :18.00 Length:113 Min. : 1.000 Length:113
## 1st Qu.:19.00 Class :character 1st Qu.: 2.000 Class :character
## Median :21.00 Mode :character Median : 6.000 Mode :character
## Mean :20.99 Mean : 6.044
## 3rd Qu.:22.00 3rd Qu.: 8.000
## Max. :28.00 Max. :10.000
## X
## Mode:logical
## NA's:113
##
##
##
##
table(dsuaz$genero)
##
## Hombre Mujer
## 51 62
table(dsuaz$edad)
##
## 18 19 20 21 22 23 24 25 26 28
## 13 19 11 23 23 15 6 1 1 1
summary(dsuaz$edad)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 19.00 21.00 20.99 22.00 28.00
table(dsuaz$municipio)
##
## Calera Concepcion del Oro Fresnillo
## 9 2 7
## Guadalupe Huejuquilla el Alto Jerez
## 11 2 19
## Mazapil Morelos Naucalpan
## 1 2 2
## Ojocaliente Orizaba Panfilo natera
## 2 1 1
## Panuco Pinos Rio Grande
## 5 2 2
## San Luis Potosi Sombrerete Susticacan
## 1 2 1
## Tabasco Tepetongo Tlaltenago Zac
## 2 3 2
## Torreon Valparaiso Zacatecas
## 1 1 32
table(dsuaz$hrs_estudio_dia)
##
## 0 4 6 8
## 8 81 18 6
summary(dsuaz$hrs_estudio_dia)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 4.000 4.000 4.248 4.000 8.000
table(dsuaz$hrs_tv_dia)
##
## 0 2 4 6
## 64 9 39 1
summary(dsuaz$hrs_tv_dia)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.593 4.000 6.000
table(dsuaz$hrs_ejercicio_dia)
##
## 0 2 4
## 74 3 36
summary(dsuaz$hrs_ejercicio_dia)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.327 4.000 4.000
table(dsuaz$hrs_universidad_dia)
##
## 0 4 6 7 8
## 1 13 41 50 8
summary(dsuaz$hrs_universidad_dia)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 6.000 7.000 6.301 7.000 8.000
table(dsuaz$hrs_dormir_dia)
##
## 3 6 8 10
## 8 42 59 4
summary(dsuaz$hrs_dormir_dia)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 6.000 8.000 6.973 8.000 10.000
table(dsuaz$transporte_publico)
##
## 0 1
## 22 91
summary(dsuaz$transporte_publico)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 1.0000 1.0000 0.8053 1.0000 1.0000
table(dsuaz$comedor)
##
## 0 1
## 57 56
summary(dsuaz$comedor)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4956 1.0000 1.0000
table(dsuaz$becario)
##
## 0 1
## 77 36
summary(dsuaz$becario)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3186 1.0000 1.0000
table(dsuaz$licenciatura)
##
## Ingenieria de Software Medicina Humana
## 33 30
## Psicologia Quimico Farmaceutico Biologo
## 25 25
table(dsuaz$semestre)
##
## 1 2 3 4 5 6 7 8 9 10
## 4 28 1 11 4 9 1 27 1 27
summary(dsuaz$semestre)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 6.000 6.044 8.000 10.000
table(dsuaz$dificultad_licenciatura)
##
## 1 2 3 4
## 3 55 45 10
summary(dsuaz$dificultad_licenciatura)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 2.000 2.549 3.000 4.000
table(dsuaz$pasion_licenciatura)
##
## 4 5 6 7 8 9 10
## 2 3 3 9 21 35 40
summary(dsuaz$pasion_licenciatura)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.000 8.000 9.000 8.735 10.000 10.000
table(dsuaz$materias_repetidas)
##
## 0 1 2 3 4 5
## 92 10 4 2 3 2
summary(dsuaz$materias_repetidas)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4071 0.0000 5.0000
table(dsuaz$promedio)
##
## 7 7.2 7.5 7.6 7.7 7.8 8 8.01 8.09 8.13 8.14 8.2 8.3 8.31 8.4
## 4 3 2 5 1 3 21 1 1 1 1 7 4 1 2
## 8.5 8.57 8.6 8.7 8.73 8.8 8.9 8.98 9 9.1 9.2 9.3 9.4 9.5 9.6
## 11 1 3 2 1 4 5 1 10 2 4 4 1 1 2
## 9.7 9.77 9.78 9.8
## 1 1 1 1
summary(dsuaz$promedio)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.000 8.000 8.400 8.423 8.980 9.800
table(dsuaz$oferta_laboral)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 2 2 1 2 7 20 5 16 20 18 20
summary(dsuaz$oferta_laboral)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 5.000 8.000 7.097 9.000 10.000
table(dsuaz$Ssalario_estimado)
##
## 10 - 15 mil 15 - 20 mil 6 - 10 mil mas de 20 mil
## 32 30 28 23
table(dsuaz$planes_egreso)
##
## Laborar Posgrado
## 52 61
boxplot(dsuaz$promedio, main="GPA of each student", col='#2999AD', ylab="Grade")
boxplot(dsuaz$pasion_licenciatura, main = "Passion of students for their career", col = "#FFC700", ylab = "Passion")
#### Job offer expectation ###### Here the results range from 0 to 10, covering the whole scale. ###### The median is 8, and the first and third quartiles are 5 and 9, which indicates that the central half of the students rate their job offer expectation within this range.
boxplot(dsuaz$oferta_laboral, main="Job offer expectation", col="#A30F2E", ylab="Expectation")
boxplot(dsuaz$edad, main="Students age", col = "#66B21A", ylab="Age")
boxplot(split(dsuaz$hrs_estudio_dia,dsuaz$genero),las = 2,main=toupper('Hours of Study according to the gender'), font.main=3, cex.main=1.2, ylab="Hours of study", font.lab=3, col='#FFBD59', names = c('Male','Female'))
boxplot(split(dsuaz$hrs_dormir_dia,dsuaz$genero),las = 2,main=toupper('Hours of Sleep according to the gender'), font.main=3, cex.main=1.2, ylab="Hours of Sleep", font.lab=3, col='#FF2E47', names = c('Male','Female'))
boxplot(split(dsuaz$hrs_ejercicio_dia,dsuaz$genero),las = 2,main=toupper('Hours of Exercise according to the gender'), font.main=3, cex.main=1.2, ylab="Hours of Exercise", font.lab=3, col='#8CBA5C', names = c('Male','Female'))
boxplot(split(dsuaz$hrs_vjuegos_dia,dsuaz$genero),las = 2,main=toupper('Hours of Video Games according to the gender'), font.main=3, cex.main=1.2, ylab="Hours of Video Games", font.lab=3, col='#B266A6', names = c('Male','Female'))
boxplot(split(dsuaz$hrs_tv_dia,dsuaz$genero),las = 2,main=toupper('Hours of TV according to the gender'), font.main=3, cex.main=1.2, ylab="Hours of TV", font.lab=3, col='#6E6E9C', names = c('Male','Female'))
boxplot(dsuaz$hrs_dormir_dia, main="Sleep hours for each student", ylab = "Sleepy hours", col = "#FF2185")
boxplot(split(dsuaz$hrs_dormir_dia,dsuaz$edad),main=toupper('Hours of sleep according to age'), font.main=3, cex.main=1.2, xlab="Age", ylab="Hours of sleep", font.lab=3, col="#31B1B1")
boxplot(split(dsuaz$hrs_dormir_dia,dsuaz$licenciatura),las = 2,main=toupper('Hours of Sleep according to the career'), font.main=3, cex.main=1.2, ylab="Hours of sleep", font.lab=3, col='#409973', names = c('Software','Medicine','Psychology','QFB'))
boxplot(dsuaz$hrs_estudio_dia, xlab="Study hours each day")
boxplot(split(dsuaz$hrs_estudio_dia,dsuaz$edad),main=toupper('Hours of Study according to age'), font.main=3, cex.main=1.2, xlab="Age", ylab="Hours of study", font.lab=3, col="#0A8FA3")
boxplot(split(dsuaz$hrs_estudio_dia,dsuaz$semestre),main=toupper('Hours of Study according to the semester'), font.main=3, cex.main=1.2, xlab="Semester", ylab="Hours of study", font.lab=3, col="#1A80B2")
boxplot(split(dsuaz$hrs_estudio_dia,dsuaz$licenciatura),las = 2,main=toupper('Hours of Study according to the career'), font.main=3, cex.main=1.2, ylab="Hours of study", font.lab=3, col='#FFBD59', names = c('Software','Medicine','Psychology','QFB'))
### Career boxplots
boxplot(split(dsuaz$hrs_universidad_dia,dsuaz$licenciatura),las = 2,main=toupper('Hours at University according to the career'), font.main=3, cex.main=1.2, ylab="Hours at university", font.lab=3, col='#FF8A36', names = c('Software','Medicine','Psychology','QFB'))
boxplot(split(dsuaz$dificultad_licenciatura,dsuaz$licenciatura),las = 2,main=toupper('Difficulty appreciated according to the career'), font.main=3, cex.main=1.2, ylab="0 = Min, 4 = Max", font.lab=3, col='#FA668A', names = c('Software','Medicine','Psychology','QFB'))
boxplot(split(dsuaz$materias_repetidas,dsuaz$licenciatura),las = 2,main=toupper('Repeated subjects according to the career'), font.main=3, cex.main=1.2, ylab="Repeated subjects", font.lab=3, col='#2E61DB', names = c('Software','Medicine','Psychology','QFB'))
boxplot(split(dsuaz$pasion_licenciatura,dsuaz$licenciatura),las = 2,main=toupper('Passion according to the career'), font.main=3, cex.main=1.2, ylab="0 = Min, 10 = Max", font.lab=3, col='#B24073', names = c('Software','Medicine','Psychology','QFB'))
boxplot(split(dsuaz$semestre,dsuaz$licenciatura),las = 2,main=toupper('Semester according to the career'), font.main=3, cex.main=1.2, ylab="Semester", font.lab=3, col='#FF99A3', names = c('Software','Medicine','Psychology','QFB'))
boxplot(split(dsuaz$promedio,dsuaz$licenciatura),las = 2,main=toupper('GPA according to the career'), font.main=3, cex.main=1.2, ylab="GPA", font.lab=3, col='#69BD8A', names = c('Software','Medicine','Psychology','QFB'))
# Job offer expectation by degree program. The plotted column is
# `oferta_laboral`, the variable that matches this title and the 0-10 axis
# label.
boxplot(
  split(dsuaz$oferta_laboral, dsuaz$licenciatura),
  las = 2,
  main = toupper('Job offer expectation according to the career'),
  font.main = 3, cex.main = 1.2,
  ylab = "0 = Min, 10 = Max", font.lab = 3,
  col = '#992966',
  names = c('Software', 'Medicine', 'Psychology', 'QFB')
)
# Expected working hours by degree program. The plotted column is
# `hrs_trabajo`, the variable that matches this title and axis label.
boxplot(
  split(dsuaz$hrs_trabajo, dsuaz$licenciatura),
  las = 2,
  main = toupper('Hours of work expected according to the career'),
  font.main = 3, cex.main = 1.2,
  ylab = "Hours of work expected", font.lab = 3,
  col = '#1F8594',
  names = c('Software', 'Medicine', 'Psychology', 'QFB')
)
mean(dsuaz$hrs_estudio_dia)
## [1] 4.247788
median(dsuaz$hrs_estudio_dia)
## [1] 4
# Return the most frequent value of `v`. When several values share the
# highest count, the one that appears first in `v` is returned.
getmode <- function(v) {
  distinct_vals <- unique(v)
  # Number of occurrences of each distinct value, in order of first appearance.
  counts <- tabulate(match(v, distinct_vals))
  distinct_vals[which.max(counts)]
}
getmode(dsuaz$hrs_estudio_dia)
## [1] 4
library(e1071)
skewness(dsuaz$hrs_estudio_dia)
## [1] -0.428141
hist(dsuaz$hrs_estudio_dia,
main = "study hours per day",
xlab = "Hours",
ylab = "Students",
labels = TRUE,
col ="#FFB852")
hist(dsuaz$hrs_estudio_dia,
xlim=c(min(dsuaz$hrs_estudio_dia),max(dsuaz$hrs_estudio_dia)), probability=T, nclass=max(dsuaz$hrs_estudio_dia)-min(dsuaz$hrs_estudio_dia)+1,
col='#99CC00',
main = "Symmetric Skewed",
xlab = "",
ylab = "",
axes=FALSE)
lines(density(dsuaz$hrs_estudio_dia,bw=1), col='#FF0038', lwd=6)
mean(dsuaz$promedio)
## [1] 8.423097
median(dsuaz$promedio)
## [1] 8.4
# Return the most frequent value of `v`. When several values share the
# highest count, the one that appears first in `v` is returned.
getmode <- function(v) {
  distinct_vals <- unique(v)
  # Number of occurrences of each distinct value, in order of first appearance.
  counts <- tabulate(match(v, distinct_vals))
  distinct_vals[which.max(counts)]
}
getmode(dsuaz$promedio)
## [1] 8
library(e1071)
skewness(dsuaz$promedio)
## [1] -0.03446313
hist(dsuaz$promedio,
main = "GPAs",
xlab = "GPA",
ylab = "Students",
labels = TRUE,
col = "#BA0066")
hist(dsuaz$promedio,
xlim=c(min(dsuaz$promedio),max(dsuaz$promedio)), probability=T, nclass=max(dsuaz$promedio)-min(dsuaz$promedio)+1,
col='#6B9196',
main = "Symmetric Skewed",
xlab = "",
ylab = "",
axes=FALSE)
lines(density(dsuaz$promedio,bw=1), col='#FF0038', lwd=6)
mean(dsuaz$hrs_dormir_dia)
## [1] 6.973451
median(dsuaz$hrs_dormir_dia)
## [1] 8
# Return the most frequent value of `v`. When several values share the
# highest count, the one that appears first in `v` is returned.
getmode <- function(v) {
  distinct_vals <- unique(v)
  # Number of occurrences of each distinct value, in order of first appearance.
  counts <- tabulate(match(v, distinct_vals))
  distinct_vals[which.max(counts)]
}
getmode(dsuaz$hrs_dormir_dia)
## [1] 8
library(e1071)
skewness(dsuaz$hrs_dormir_dia)
## [1] -0.8900936
# Frequency histogram of daily sleep hours with the count printed on each bar.
# The title and x-axis label describe the plotted variable (`hrs_dormir_dia`).
hist(dsuaz$hrs_dormir_dia,
     main = "Histogram of sleep hours of each student",
     xlab = "Sleep hours",
     ylab = "Students quantity",
     labels = TRUE,
     col = "#1FA8DB")
# Density-scaled histogram drawn without axes, used as a backdrop for the
# kernel density curve that illustrates the shape of the distribution.
# The suggested number of classes is (max - min + 1).
hist(dsuaz$hrs_dormir_dia,
     xlim = range(dsuaz$hrs_dormir_dia),
     probability = TRUE,
     nclass = diff(range(dsuaz$hrs_dormir_dia)) + 1,
     col = '#009999',
     main = "Negative Skewed",
     xlab = "",
     ylab = "",
     axes = FALSE)
# Overlay the kernel density estimate (bandwidth fixed at 1).
lines(density(dsuaz$hrs_dormir_dia, bw = 1), col = '#FF0038', lwd = 6)
mean(dsuaz$pasion_licenciatura)
## [1] 8.734513
median(dsuaz$pasion_licenciatura)
## [1] 9
# Return the most frequent value of `v`. When several values share the
# highest count, the one that appears first in `v` is returned.
getmode <- function(v) {
  distinct_vals <- unique(v)
  # Number of occurrences of each distinct value, in order of first appearance.
  counts <- tabulate(match(v, distinct_vals))
  distinct_vals[which.max(counts)]
}
getmode(dsuaz$pasion_licenciatura)
## [1] 10
library(e1071)
skewness(dsuaz$pasion_licenciatura)
## [1] -1.351363
hist(dsuaz$pasion_licenciatura,
main = "Histogram of the students pasion/love with his/her career",
xlab = "Pasion",
ylab = "Students quantity",
labels = TRUE,
col="#3D9E6B")
hist(dsuaz$pasion_licenciatura,
xlim=c(min(dsuaz$pasion_licenciatura),max(dsuaz$pasion_licenciatura)), probability=T, nclass=max(dsuaz$pasion_licenciatura)-min(dsuaz$pasion_licenciatura)+1,
col='#FFA342',
main = "Negative Skewed",
xlab = "",
ylab = "",
axes=FALSE)
lines(density(dsuaz$pasion_licenciatura,bw=1), col='#FF0038', lwd=6)
mean(dsuaz$oferta_laboral)
## [1] 7.097345
median(dsuaz$oferta_laboral)
## [1] 8
# Return the most frequent value of `v`. When several values share the
# highest count, the one that appears first in `v` is returned.
getmode <- function(v) {
  distinct_vals <- unique(v)
  # Number of occurrences of each distinct value, in order of first appearance.
  counts <- tabulate(match(v, distinct_vals))
  distinct_vals[which.max(counts)]
}
getmode(dsuaz$oferta_laboral)
## [1] 5
library(e1071)
skewness(dsuaz$oferta_laboral)
## [1] -0.7729701
# Frequency histogram of the job offer expectation score with the count
# printed on each bar. The title and x-axis label name the plotted variable
# (`oferta_laboral`) as an expectation score.
hist(dsuaz$oferta_laboral,
     main = "Histogram of the job offer expectation of each student",
     xlab = "Expectation",
     ylab = "Students quantity",
     labels = TRUE,
     col = "#66A640")
# Density-scaled histogram drawn without axes, used as a backdrop for the
# kernel density curve that illustrates the shape of the distribution.
# The suggested number of classes is (max - min + 1).
hist(dsuaz$oferta_laboral,
     xlim = range(dsuaz$oferta_laboral),
     probability = TRUE,
     nclass = diff(range(dsuaz$oferta_laboral)) + 1,
     col = '#FF6666',
     main = "Negative Skewed",
     xlab = "",
     ylab = "",
     axes = FALSE)
# Overlay the kernel density estimate (bandwidth fixed at 1).
lines(density(dsuaz$oferta_laboral, bw = 1), col = '#FF0038', lwd = 6)
mean(dsuaz$edad)
## [1] 20.99115
median(dsuaz$edad)
## [1] 21
# Return the most frequent value of `v`. When several values share the
# highest count, the one that appears first in `v` is returned.
getmode <- function(v) {
  distinct_vals <- unique(v)
  # Number of occurrences of each distinct value, in order of first appearance.
  counts <- tabulate(match(v, distinct_vals))
  distinct_vals[which.max(counts)]
}
getmode(dsuaz$edad)
## [1] 21
library(e1071)
skewness(dsuaz$edad)
## [1] 0.3920733
hist(dsuaz$edad,
main = "Histogram of the student age",
xlab = "Age",
ylab = "Students quantity",
labels = TRUE,
col ="#2E5CC7")
hist(dsuaz$edad,
xlim=c(min(dsuaz$edad),max(dsuaz$edad)), probability=T, nclass=max(dsuaz$edad)-min(dsuaz$edad)+1,
col='#66477A',
main = "Positive Skewed",
xlab = "",
ylab = "",
axes=FALSE)
lines(density(dsuaz$edad,bw=1), col='#FF0038', lwd=6)
plot( x = dsuaz$promedio, y = dsuaz$hrs_dormir_dia,
main = "Scatterplot of Average of Grades vs. Hours-per-day for sleeping",
xlab = "Average of Grades", ylab = "Hours-per-day for sleeping")
plot( x = dsuaz$hrs_estudio_dia, y = dsuaz$hrs_universidad_dia,
main = "Scatterplot of Hours-per-day for Study vs. Hours-per-day for staying in university",
xlab = "Hours-per-day for Study", ylab = "Hours-per-day for staying in university")
plot( x = dsuaz$dificultad_licenciatura, y = dsuaz$pasion_licenciatura,
main = "Scatterplot of Difficulty On Career vs. Passion For Career",
xlab = "Difficulty on Career", ylab = "Passion For Career")
sprintf("Correlation coefficient: %f", cor(dsuaz$dificultad_licenciatura, dsuaz$pasion_licenciatura, use="pairwise.complete.obs"))
## [1] "Correlation coefficient: 0.050797"
sprintf("Correlation coefficient: %f", cor(dsuaz$hrs_estudio_dia, dsuaz$pasion_licenciatura, use="pairwise.complete.obs"))
## [1] "Correlation coefficient: 0.327390"
sprintf("Correlation coefficient: %f", cor(dsuaz$hrs_estudio_dia, dsuaz$promedio, use="pairwise.complete.obs"))
## [1] "Correlation coefficient: 0.152540"
# Min-max normalization: rescale `x` linearly so that min(x) maps to 0 and
# max(x) maps to 1.
minmax <- function(x) {
  lo <- min(x)
  hi <- max(x)
  (x - lo) / (hi - lo)
}
# Z-score standardization: center `x` on its mean and scale it by its sample
# standard deviation (`sd()`), so the result has mean 0 and standard
# deviation 1.
zscore <- function(x) {
  # The centering must be parenthesized: `/` binds tighter than `-`, so
  # without the parentheses only mean(x) would be divided by sd(x) and the
  # values would merely be shifted by a constant.
  (x - mean(x)) / sd(x)
}
# Encode the estimated-salary bracket as an integer code. The levels are
# listed explicitly in ascending order because the default alphabetical
# ordering would rank "6 - 10 mil" above "15 - 20 mil".
# Codes: 1 = 6 - 10 mil, 2 = 10 - 15 mil, 3 = 15 - 20 mil, 4 = mas de 20 mil
salary_levels <- c("6 - 10 mil", "10 - 15 mil", "15 - 20 mil", "mas de 20 mil")
# Fail loudly instead of silently producing NA for an unexpected bracket.
stopifnot(all(dsuaz$Ssalario_estimado %in% salary_levels))
dsuaz$Ssalario_estimado <- as.numeric(
  factor(dsuaz$Ssalario_estimado, levels = salary_levels)
)
# Codes: 1 = Laborar, 2 = Posgrado (alphabetical factor levels)
dsuaz$planes_egreso <- as.numeric(factor(dsuaz$planes_egreso))
# Codes: 1 = Hombre, 2 = Mujer (alphabetical factor levels)
dsuaz$genero <- as.numeric(factor(dsuaz$genero))
# The degree program stays a factor; it is the class label for kNN.
dsuaz$licenciatura <- factor(dsuaz$licenciatura)
# Features used for classification, selected by name (columns 3, 5, 8, 13
# and 17 of the loaded data) so the selection survives column reordering.
feature_cols <- c(
  "hrs_vjuegos_dia", "hrs_universidad_dia", "hrs_dormir_dia",
  "dificultad_licenciatura", "hrs_trabajo"
)
# Scaled copies of the features: min-max normalized and z-score standardized.
dsuaz_n <- as.data.frame(lapply(dsuaz[, feature_cols], minmax))
dsuaz_zs <- as.data.frame(lapply(dsuaz[, feature_cols], zscore))
###### In this scenario, we calculated kNN with Manhattan distance and k = 17; the algorithm got 17 correct answers out of 23, which is almost the same result as kNN with Euclidean distance.
# Hold-out split of the min-max normalized features: the first 90 rows are
# used for training and rows 91 to 113 for testing.
dsuaz_train <- dsuaz_n[seq_len(90L), ]
dsuaz_testing <- dsuaz_n[seq.int(91L, 113L), ]
# Matching class labels, taken from column 21 of the full data frame.
dsuaz_train_labels <- dsuaz[[21]][seq_len(90L)]
dsuaz_testing_labels <- dsuaz[[21]][seq.int(91L, 113L)]
# Load the packages for this section; knnVCN() is called below.
# NOTE(review): knnVCN() presumably comes from knnGarden — confirm.
library(knnGarden)
## Loading required package: cluster
library(gmodels)
# Classify the test rows with kNN using K = 17 and method = "manhattan",
# trained on the first 90 rows and their labels.
# NOTE(review): `p = 2` is presumably only relevant to a Minkowski-type
# distance — confirm against the knnVCN documentation.
resultsMH <- knnVCN(dsuaz_train,dsuaz_train_labels,dsuaz_testing, K = 17, ShowObs = FALSE,method = "manhattan",p =2)
# Print the predicted class for each test row.
resultsMH
## TstXIBelong
## 1 Psicologia
## 2 Ingenieria de Software
## 3 Medicina Humana
## 4 Medicina Humana
## 5 Psicologia
## 6 Psicologia
## 7 Ingenieria de Software
## 8 Psicologia
## 9 Ingenieria de Software
## 10 Quimico Farmaceutico Biologo
## 11 Ingenieria de Software
## 12 Medicina Humana
## 13 Psicologia
## 14 Medicina Humana
## 15 Ingenieria de Software
## 16 Psicologia
## 17 Medicina Humana
## 18 Psicologia
## 19 Ingenieria de Software
## 20 Psicologia
## 21 Psicologia
## 22 Psicologia
## 23 Quimico Farmaceutico Biologo
dsuaz_testing_labels
## [1] Ingenieria de Software Quimico Farmaceutico Biologo
## [3] Medicina Humana Medicina Humana
## [5] Psicologia Psicologia
## [7] Ingenieria de Software Quimico Farmaceutico Biologo
## [9] Medicina Humana Quimico Farmaceutico Biologo
## [11] Ingenieria de Software Medicina Humana
## [13] Psicologia Medicina Humana
## [15] Ingenieria de Software Psicologia
## [17] Quimico Farmaceutico Biologo Psicologia
## [19] Ingenieria de Software Psicologia
## [21] Ingenieria de Software Psicologia
## [23] Quimico Farmaceutico Biologo
## 4 Levels: Ingenieria de Software Medicina Humana ... Quimico Farmaceutico Biologo
dsuaz_train_labels <- dsuaz[1:90,21]
dsuaz_testing_labels <- dsuaz[91:113,21]
table(dsuaz_train_labels)
## dsuaz_train_labels
## Ingenieria de Software Medicina Humana
## 27 25
## Psicologia Quimico Farmaceutico Biologo
## 18 20
table(dsuaz_testing_labels)
## dsuaz_testing_labels
## Ingenieria de Software Medicina Humana
## 6 5
## Psicologia Quimico Farmaceutico Biologo
## 7 5
dsuaz_train <- dsuaz_n[1:90,]
dsuaz_testing <- dsuaz_n[91:113,]
dim(dsuaz_train)
## [1] 90 5
dim(dsuaz_testing)
## [1] 23 5
dsuaz_train_zs <- dsuaz_zs[1:90,]
dsuaz_testing_zs <- dsuaz_zs[91:113,]
dim(dsuaz_train_zs)
## [1] 90 5
dim(dsuaz_testing_zs)
## [1] 23 5
library(class)
library(gmodels)
dsuaz_results <- knn(train = dsuaz_train,test = dsuaz_testing,cl = dsuaz_train_labels, k = 1)
CrossTable(x = dsuaz_testing_labels,
y = dsuaz_results,
prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 23
##
##
## | dsuaz_results
## dsuaz_testing_labels | Ingenieria de Software | Medicina Humana | Psicologia | Quimico Farmaceutico Biologo | Row Total |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Ingenieria de Software | 3 | 0 | 3 | 0 | 6 |
## | 0.500 | 0.000 | 0.500 | 0.000 | 0.261 |
## | 0.750 | 0.000 | 0.429 | 0.000 | |
## | 0.130 | 0.000 | 0.130 | 0.000 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Medicina Humana | 0 | 5 | 0 | 0 | 5 |
## | 0.000 | 1.000 | 0.000 | 0.000 | 0.217 |
## | 0.000 | 0.625 | 0.000 | 0.000 | |
## | 0.000 | 0.217 | 0.000 | 0.000 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Psicologia | 1 | 0 | 4 | 2 | 7 |
## | 0.143 | 0.000 | 0.571 | 0.286 | 0.304 |
## | 0.250 | 0.000 | 0.571 | 0.500 | |
## | 0.043 | 0.000 | 0.174 | 0.087 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Quimico Farmaceutico Biologo | 0 | 3 | 0 | 2 | 5 |
## | 0.000 | 0.600 | 0.000 | 0.400 | 0.217 |
## | 0.000 | 0.375 | 0.000 | 0.500 | |
## | 0.000 | 0.130 | 0.000 | 0.087 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Column Total | 4 | 8 | 7 | 4 | 23 |
## | 0.174 | 0.348 | 0.304 | 0.174 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
##
##
library(class)
library(gmodels)
dsuaz_results_zs <- knn(train = dsuaz_train_zs,test = dsuaz_testing_zs,cl = dsuaz_train_labels, k = 1)
CrossTable(x = dsuaz_testing_labels,
y = dsuaz_results_zs,
prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 23
##
##
## | dsuaz_results_zs
## dsuaz_testing_labels | Ingenieria de Software | Medicina Humana | Psicologia | Quimico Farmaceutico Biologo | Row Total |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Ingenieria de Software | 3 | 0 | 2 | 1 | 6 |
## | 0.500 | 0.000 | 0.333 | 0.167 | 0.261 |
## | 0.750 | 0.000 | 0.333 | 0.200 | |
## | 0.130 | 0.000 | 0.087 | 0.043 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Medicina Humana | 0 | 5 | 0 | 0 | 5 |
## | 0.000 | 1.000 | 0.000 | 0.000 | 0.217 |
## | 0.000 | 0.625 | 0.000 | 0.000 | |
## | 0.000 | 0.217 | 0.000 | 0.000 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Psicologia | 1 | 1 | 4 | 1 | 7 |
## | 0.143 | 0.143 | 0.571 | 0.143 | 0.304 |
## | 0.250 | 0.125 | 0.667 | 0.200 | |
## | 0.043 | 0.043 | 0.174 | 0.043 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Quimico Farmaceutico Biologo | 0 | 2 | 0 | 3 | 5 |
## | 0.000 | 0.400 | 0.000 | 0.600 | 0.217 |
## | 0.000 | 0.250 | 0.000 | 0.600 | |
## | 0.000 | 0.087 | 0.000 | 0.130 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Column Total | 4 | 8 | 6 | 5 | 23 |
## | 0.174 | 0.348 | 0.261 | 0.217 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
##
##
library(class)
library(gmodels)
dsuaz_results <- knn(train = dsuaz_train,test = dsuaz_testing,cl = dsuaz_train_labels, k = 20)
CrossTable(x = dsuaz_testing_labels,
y = dsuaz_results,
prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 23
##
##
## | dsuaz_results
## dsuaz_testing_labels | Ingenieria de Software | Medicina Humana | Psicologia | Quimico Farmaceutico Biologo | Row Total |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Ingenieria de Software | 4 | 0 | 1 | 1 | 6 |
## | 0.667 | 0.000 | 0.167 | 0.167 | 0.261 |
## | 0.667 | 0.000 | 0.200 | 0.143 | |
## | 0.174 | 0.000 | 0.043 | 0.043 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Medicina Humana | 1 | 4 | 0 | 0 | 5 |
## | 0.200 | 0.800 | 0.000 | 0.000 | 0.217 |
## | 0.167 | 0.800 | 0.000 | 0.000 | |
## | 0.043 | 0.174 | 0.000 | 0.000 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Psicologia | 0 | 0 | 4 | 3 | 7 |
## | 0.000 | 0.000 | 0.571 | 0.429 | 0.304 |
## | 0.000 | 0.000 | 0.800 | 0.429 | |
## | 0.000 | 0.000 | 0.174 | 0.130 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Quimico Farmaceutico Biologo | 1 | 1 | 0 | 3 | 5 |
## | 0.200 | 0.200 | 0.000 | 0.600 | 0.217 |
## | 0.167 | 0.200 | 0.000 | 0.429 | |
## | 0.043 | 0.043 | 0.000 | 0.130 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Column Total | 6 | 5 | 5 | 7 | 23 |
## | 0.261 | 0.217 | 0.217 | 0.304 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
##
##
library(class)
library(gmodels)
dsuaz_results_zs <- knn(train = dsuaz_train_zs,test = dsuaz_testing_zs,cl = dsuaz_train_labels, k = 10)
CrossTable(x = dsuaz_testing_labels,
y = dsuaz_results_zs,
prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 23
##
##
## | dsuaz_results_zs
## dsuaz_testing_labels | Ingenieria de Software | Medicina Humana | Psicologia | Quimico Farmaceutico Biologo | Row Total |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Ingenieria de Software | 4 | 0 | 2 | 0 | 6 |
## | 0.667 | 0.000 | 0.333 | 0.000 | 0.261 |
## | 0.571 | 0.000 | 0.222 | 0.000 | |
## | 0.174 | 0.000 | 0.087 | 0.000 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Medicina Humana | 2 | 3 | 0 | 0 | 5 |
## | 0.400 | 0.600 | 0.000 | 0.000 | 0.217 |
## | 0.286 | 0.750 | 0.000 | 0.000 | |
## | 0.087 | 0.130 | 0.000 | 0.000 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Psicologia | 0 | 0 | 7 | 0 | 7 |
## | 0.000 | 0.000 | 1.000 | 0.000 | 0.304 |
## | 0.000 | 0.000 | 0.778 | 0.000 | |
## | 0.000 | 0.000 | 0.304 | 0.000 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Quimico Farmaceutico Biologo | 1 | 1 | 0 | 3 | 5 |
## | 0.200 | 0.200 | 0.000 | 0.600 | 0.217 |
## | 0.143 | 0.250 | 0.000 | 1.000 | |
## | 0.043 | 0.043 | 0.000 | 0.130 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
## Column Total | 7 | 4 | 9 | 3 | 23 |
## | 0.304 | 0.174 | 0.391 | 0.130 | |
## -----------------------------|------------------------------|------------------------------|------------------------------|------------------------------|------------------------------|
##
##